* ---------------------------------------------------------------------------
* Builds the master dataset by combining the cleaned KUB test data with the
* MFR birth records (see program main below).
* ---------------------------------------------------------------------------
* Interpret the rest of the file under Stata 16.1 syntax rules
version 16.1
* Start from an empty session (data, programs, labels, etc.)
clear all
* Relative paths below (ado directory, build.log) are resolved from this folder
cd "MYPATH\derived\make_master_dataset"
* Make the project's shared ado directory searchable for commands
adopath + ../../ado/

* Close any log left open by an earlier run; capture ignores the error if none is open
cap log close

* Not defined in this file; presumably a project ado that sets the TEMP and
* DATA globals used below -- TODO confirm
preliminaries


log using "build.log", replace
	
* Create the scratch folder for intermediate files; capture ignores the error if it exists
cap mkdir "$TEMP\derived\make_master_dataset"


* NOTE(review): no random-number function is called in this file; the seed does
* not govern how -sort- orders tied observations -- confirm whether it is needed
set seed 921

program main
    * Entry point: cleans both sources, pairs KUB tests with MFR births of the
    * same mother, resolves test-pregnancy matches, assigns pregnancy ids,
    * derives the per-pregnancy risk scores and saves the master dataset.
    clean_mfr
    clean_kub

    * Pair every KUB test with every MFR birth sharing lopnr. mother_status is
    * the joinby merge indicator: 1 = KUB (master) only, 2 = MFR (using) only,
    * 3 = present in both.
    use "$TEMP\derived\make_master_dataset\kub_clean.dta", clear
    joinby lopnr using "$TEMP\derived\make_master_dataset\mfr_clean.dta", ///
        unmatched(both) _merge(mother_status)
    label define mother_status 1 "Mother not in MFR" ///
        2 "Mother in MFR, no test in KUB" ///
        3 "Mother in MFR and has test in KUB"
    label values mother_status mother_status

    * in_MFR is 1 unless the row came from KUB only
    generate in_MFR = (mother_status != 1)
    codebook mother_status

    * MFR births whose lopnr never appears in the KUB tests are not needed
    drop if mother_status == 2

    * KUB tests without any MFR record for the mother skip the matching step:
    * park them on disk now and append them back afterwards
    preserve
        keep if mother_status == 1
        save "$TEMP\derived\make_master_dataset\mom_kub_only.dta", replace
    restore
    drop if mother_status == 1

    display "Match tests and pregnancies"
    match_preg_test

    display "Append tests from KUB that don'thave matching lopnr in MFR'"
    append using "$TEMP\derived\make_master_dataset\mom_kub_only.dta"

    display "Assign pregnancy ids"
    assign_unique_pregnancy_ids

    * Per-pregnancy minimum of each risk variable, then the smaller of the two;
    * when one of them is missing the other one is used
    display "KUB risk score vars"
    bysort pregnancy: egen fetus_risk1 = min(kombinerad_risk_t13_18)
    bysort pregnancy: egen fetus_risk2 = min(kombinerad_risk_t21)
    generate fetus_risk = min(fetus_risk1, fetus_risk2)
    replace fetus_risk = fetus_risk1 if missing(fetus_risk2)
    replace fetus_risk = fetus_risk2 if missing(fetus_risk1)

    display "Label variables"
    label_vars

    save "$DATA\master_dataset", replace
end

program clean_mfr
	* Purpose: load the MFR birth records, parse the two due-date variables,
	* impute a daily birth date, collapse the diagnosis columns into one string,
	* assign a pregnancy identifier (preg_id_mfr), drop non-singletons and save
	* the result as mfr_clean.dta in the TEMP folder.

	use lopnr childid bfoddat birth_flag nonsingleton num_birth_records grdbs grvfv grdfv ///
	  bpsmdat bpuldat bdiag* ///
		  using "MYPATH\MYPATH.dta", clear
    format lopnr  %15.0f
	
	di "clean bpuldat/bpsmdat"
	* Each variable is read as text laid out as YYYYMMDD (year = chars 1-4,
	* month = chars 5-6, day = chars 7-8) and rebuilt as a Stata daily date.
	* Values too short to contain a month or day yield a missing date.
	foreach var in bpul bpsm {
		tostring `var'dat, replace
		gen `var'dat_year = substr(`var'dat, 1, 4)
		gen `var'dat_month = substr(`var'dat, 5, 2) if length(`var'dat) > 4
		gen `var'dat_day = substr(`var'dat, 7, 2) if length(`var'dat) >5
		destring `var'dat_year `var'dat_month `var'dat_day `var'dat, replace
		replace `var'dat = mdy(`var'dat_month, `var'dat_day, `var'dat_year)
		format `var'dat %td
	}
	
	di "Extract birth date"
	* bfoddat is split as YYYYMM: last two digits = month, the rest = year
	gen bfoddat_month = mod(bfoddat, 100)
	gen bfoddat_yr = floor(bfoddat/100)
	
	di "Calculate birth date = expected due date - 280 + gest age at birth"
	destring grdfv, replace
	* Gestational age in days: grdbs when present, otherwise grvfv*7 + grdfv
	* (presumably completed weeks plus extra days -- TODO confirm)
	gen gest_age_birth = grdbs
	replace gest_age_birth = grvfv*7 + grdfv if mi(grdbs)
	* Fallback chain for the birth date; birth_date_flag records which rule was used:
	*   1 = from bpuldat, 2 = from bpsmdat, 3 = 15th of the bfoddat month
	gen birth_date = bpuldat - 280 + gest_age_birth
	gen birth_date_flag = 1 if !mi(birth_date)
	replace birth_date = bpsmdat - 280 + gest_age_birth if mi(birth_date)
	replace birth_date_flag = 2 if !mi(birth_date) & mi(birth_date_flag)
	replace birth_date = mdy(bfoddat_month, 15, bfoddat_yr) if mi(birth_date)
	* Note: flag 3 is also set when birth_date is still missing at this point
	replace birth_date_flag = 3 if mi(birth_date_flag)
	format birth_date %td
	label define bdflag 1 "bpuldat - 280 + gest_age_birth" 2 "bpsmdat - 280 + gest_age_birth" ///
	  3 "15th of bfoddat"
	label val birth_date_flag bdflag
	
	* Flag births whose imputed month disagrees with the recorded bfoddat month
	* (99 = birth date was taken from bfoddat itself, so no comparison is possible)
	gen impute_birth_month_discrep = 99
	gen b_m = month(birth_date)
	replace impute_birth_month_discrep = 0 if birth_date_flag == 1 | birth_date_flag == 2
	replace impute_birth_month_discrep = 1 if b_m != bfoddat_month & birth_date_flag == 1
	replace impute_birth_month_discrep = 1 if b_m != bfoddat_month & birth_date_flag == 2
	label define bmdisc 0 "No discrepancy" 1 "Discrepancy" 99 "Birth date from bfoddat"
	label val impute_birth_month_discrep bmdisc
	label variable impute_birth_month_discrep "flag whether imputed birth date month == bfoddat month"
	tab impute_birth_month_discrep, mi
		
    di "Baby's diagnoses" 
	* Concatenate bdiag1-bdiag12 into one string (no separator between codes)
	gen baby_diagnosis_birth = ""

	forval i = 1/12 {
		replace baby_diagnosis_birth = baby_diagnosis_birth + "" + bdiag`i' if !mi(bdiag`i')
	}
	drop bdiag* 
	
	di "Pregnancy identifier"
	* The id is the row number after sorting on bfoddat. The stable option keeps
	* tied rows in their existing order; without it ties are ordered randomly and
	* the ids would differ from one run to the next.
	sort bfoddat, stable
	gen preg_id_mfr = _n 
    
	di "Drop non-singletons"
	tab nonsingleton
	drop if nonsingleton == 2
    save "$TEMP\derived\make_master_dataset\mfr_clean.dta", replace
end

program clean_kub
    * Purpose: load the integrated KUB test data, convert the date variables,
    * drop unusable or out-of-window tests, build test_id, recode county and
    * the yes/no text variables, drop non-singleton tests and save the result
    * as kub_clean.dta in the TEMP folder.
    di "Cleaning [kub.dta]..."
    use "${DATA}\KUB_data\cleaned_integrated_kub.dta", clear
    foreach date in sm_datum et_datum bp_enligt_et undersokningsdatum  {
        extract_date, var(`date')
    }
    * Expected end date implied by sm_datum (sm_datum + 280 days)
    gen bpsm_kub = sm_datum + 280
    extract_date, var(bpsm_kub)
    format lopnr %15.0f
    drop if missing(foster_risk)
	
	di "missing both risk scores"
	di "-- counts for 2011-2019"
	count if undersokningsdatum >= td(01jan2011) & undersokningsdatum < td(01jan2020)
	count if undersokningsdatum >= td(01jan2011) & undersokningsdatum < td(01jan2020) & ///
	  mi(kombinerad_risk_t13_18) & mi(kombinerad_risk_t21)
	drop if mi(kombinerad_risk_t13_18) & mi(kombinerad_risk_t21)
    * A test is identified by mother, exam date and fetus number; stop if that fails
    isid lopnr undersokningsdatum foster_risk
	
    di "Drop tests performed before 01feb2009"
    di "Obs:"
    count if undersokningsdatum < td(01feb2009)
    drop if undersokningsdatum < td(01feb2009)
    
    di "Generating [test_id]..."
    egen test_id = group(lopnr undersokningsdatum foster_risk)
    order test_id, first
    recode_county, from(lan)
    
    * Expected end date: bp_enligt_et when available, otherwise bpsm_kub
    gen preg_end_date_kub = bp_enligt_et
    replace preg_end_date_kub = bpsm_kub if mi(preg_end_date_kub)
   
    di "Generating indicators"
    * Text answers become 1 (contains "Ja"), 0 ("Nej"), .p ("Vet ej") or .d (anything else)
    label def yn 1 "Yes" 0 "No" .p "Don't know, var populated" .d "Don't know, var missing"
    foreach var of varlist *utfort ivf_graviditet rokare {
        * Use a real temporary variable so the loop cannot collide with a
        * dataset variable that happens to be called temp
        tempvar coded
        gen `coded' = cond(strpos(`var', "Ja") > 0, 1, ///
                    cond(strpos(`var', "Nej") > 0, 0, ///
                    cond(strpos(`var', "Vet ej") > 0, .p, .d)))
        drop `var'
        gen `var' = `coded'
        label val `var' yn
        drop `coded'
    }
	
	di "Drop non-singleton tests"
	* A mother/exam-date pair whose largest foster_risk exceeds 1 has more than one fetus
	bysort lopnr undersokningsdatum: egen num_fetuses_kub = max(foster_risk)
	tab num_fetuses_kub
	drop if num_fetuses_kub > 1
	drop num_fetuses_kub
   
    save "$TEMP\derived\make_master_dataset\kub_clean.dta", replace
end

    program extract_date
        * Turns the variable given in var() into a daily date and adds the
        * companion variables <var>_yr and <var>_mo.
        *   - string input is parsed as YMD text
        *   - double input is treated as a datetime and converted with dofc()
        *   - any other storage type is left as is and only reformatted
        syntax, var(varname)
        display "Extracting date in [`var']..."

        local storage : type `var'
        local from_string = strpos("`storage'", "str") > 0
        local from_clock = ("`storage'" == "double")

        quietly {
            if `from_string' | `from_clock' {
                * Keep the raw values under a temporary name, then rebuild
                * the variable under its original name as a daily date
                tempvar raw
                clonevar `raw' = `var'
                drop `var'
                if `from_string' {
                    noisily display "Variable format is string YYYY-MM-DD"
                    noisily display "Processing..."
                    generate `var' = date(`raw', "YMD")
                }
                else {
                    noisily display "Variable format is datetime"
                    noisily display "Processing..."
                    generate `var' = dofc(`raw')
                }
            }
            format `var' %tdCCYY-NN-DD
            generate `var'_yr = year(`var')
            generate `var'_mo = month(`var')
        }
    end

    program recode_county
        * Replaces the county names held in the string variable given in
        * from() with numeric county codes, converts the variable to numeric
        * and attaches the value label lan.
        syntax, from(varname)
        di "Clean lan"

        * Parallel lists: the k-th code is assigned to every value containing
        * the k-th name fragment. Code 4 appears twice because two spellings
        * are matched. Pairs are applied in list order.
        local codes 1 3 4 4 5 6 7 8 9 10 12 13 14 17 18 19 20 21 22 23 24 25
        local names `""Stockholm" "Uppsala" "Södermanland" "Sörmland" "Östergötland" "Jönköping" "Kronoberg" "Kalmar" "Gotland" "Blekinge" "Skåne" "Halland" "Västra Götaland" "Värmland" "Örebro" "Västmanland" "Dalarna" "Gävleborg" "Västernorrland" "Jämtland" "Västerbotten" "Norrbotten""'
        local n_pairs : word count `codes'

        quietly {
            forvalues k = 1/`n_pairs' {
                local code : word `k' of `codes'
                local fragment : word `k' of `names'
                replace `from' = "`code'" if strpos(`from', "`fragment'") > 0
            }
        }
        destring `from', replace

        label define lan ///
            1 "Stockholm" 3 "Uppsala" 4 "Södermanland" 5 "Östergötland" ///
            6 "Jönköping" 7 "Kronoberg" 8 "Kalmar" 9 "Gotland" ///
            10 "Blekinge" 12 "Skåne" 13 "Halland" 14 "Västra Götaland" ///
            17 "Värmland" 18 "Örebro" 19 "Västmanland" 20 "Dalarna" ///
            21 "Gävleborg" 22 "Västernorrland" 23 "Jämtland" ///
            24 "Västerbotten" 25 "Norrbotten"
        label values `from' lan

        * Codes 11, 15 and 16 are not handled here. The earlier note in this
        * file says all three belong to lan 14.
        * NOTE(review): code 11 (former Kristianstad county) appears to have
        * been merged into Skåne (12), not 14 -- verify against an official
        * county code list before implementing any remapping.
    end


program match_preg_test
	* Purpose: decide which MFR pregnancy, if any, each KUB test belongs to.
	* Input (in memory): one row per test x birth of the same mother, as built
	* by the joinby in main. Output: one row per test. Matched tests carry the
	* MFR variables of their pregnancy; unmatched tests keep pregnancy missing
	* and get in_MFR = 0.
	* Uses -unique-, which is not defined in this file (presumably a
	* user-written command available on the adopath -- TODO confirm).

	* Match preg to test if test is within 250 days of the birth date
	gen pregnancy = .
	format undersokningsdatum %td
	gen bfo_15 = mdy(bfoddat_month, 15, bfoddat_yr)
	* inrange() treats missing bounds as open-ended (it returns 1 when both are
	* missing), so rows without a birth_date must be excluded explicitly
	replace pregnancy = preg_id_mfr if inrange(undersokningsdatum, birth_date - 250, birth_date) & !mi(birth_date)
	gen diff_bd_test = birth_date - undersokningsdatum
	gen diff_bfo_test = bfo_15 - undersokningsdatum
	gen diff_bfo_bd = bfo_15 - birth_date

	di "assert that no test is assigned two pregnancies"
	* If a test had two different pregnancy ids, at least one would differ from their mean
	bysort test_id: egen mean_preg_id = mean(pregnancy)
	assert pregnancy == mean_preg_id if !mi(pregnancy)

	di "How many tests have a match at this stage? "
	preserve
	collapse (firstnm) pregnancy, by(test_id)
	codebook pregnancy
	restore
	
	di "Assert that there are no 'close' (between 250-270) tests we are not matching when we match a diff test" 
	* assigned_preg = 1 when any row of the test already has a pregnancy
	bysort test_id: egen assigned_preg = max(pregnancy)
	replace assigned_preg = 1*(!mi(assigned_preg))
	
	di "------Check the 1 contradiction"
	* This section was written for a single known contradiction: r(mean) is a
	* valid test_id only when exactly one test satisfies the condition. With
	* more than one, the assert further down is expected to fail.
	sum test_id if diff_bd_test < 270 & diff_bd_test > 0 & assigned_preg == 1 & mi(pregnancy) 
	local test_contra = r(mean) 
	tab undersokningsdatum if test_id == `test_contra'
	tab birth_date if test_id == `test_contra'
	tab diff_bd_test if test_id == `test_contra'
	tab birth_flag if test_id == `test_contra' 
	
	di "----- drop the one contradiction because test is in 2009, so pregnancy can't be in our sample"
	drop if test_id == `test_contra' & assigned_preg == 1 & mi(pregnancy) 
	
	di "--- now run assert"
	assert diff_bd_test >= 270 | diff_bd_test < 0 if assigned_preg == 1 & mi(pregnancy) 
	
	di "For tests where we don't match the test to any preg"
	di "-- if there is a 'close' case (diff_bd_test between 250-270), and bpsms match up across datasets," 
	di "-- then create match"
	gen close_no_match = 1 if diff_bd_test < 270 & diff_bd_test > 0 & assigned_preg == 0 & mi(pregnancy)
	* A missing diff_bpsms never satisfies the comparison below, so both dates must be present
	gen diff_bpsms = abs(bpsm_kub - bpsmdat)
	replace pregnancy = preg_id_mfr if close_no_match == 1 & diff_bpsms < 45 
	
	* Recalculate which tests have been assigned a pregnancy
	drop assigned_preg
	bysort test_id: egen assigned_preg = max(pregnancy)
	replace assigned_preg = 1*(!mi(assigned_preg))
	
	unique test_id  if assigned_preg == 0 & mi(pregnancy)
	unique test_id if assigned_preg == 1
	
	di "Count how many cases between 210 and 270 we ended up matching"
	count if inrange(diff_bd_test, 210, 270) & !mi(pregnancy) 
	di "Out of how many total tests"
	qui unique test_id 
	di r(unique)
	
	* drop extra test-preg combos for tests where we found a match
	drop if assigned_preg == 1 & mi(pregnancy)
	
	* save matched test-pregs separately to append back later
	preserve
	keep if !mi(pregnancy)
	
		* When a pregnancy has several tests, keep only the latest one.
		* duplicates tag stores the number of surplus copies (1 for a pair,
		* 2 for a triple, ...), so every value above zero must be handled.
		duplicates tag pregnancy, gen(dup_preg)
		tab dup_preg
		bysort pregnancy: egen latest_test = max(undersokningsdatum)
		drop if dup_preg > 0 & undersokningsdatum != latest_test
		drop dup_preg latest_test
	
	duplicates tag test_id, gen(dup)
	assert dup == 0 // sanity check to ensure no tests have duplicates
	drop dup 
	tempfile matched_test_pregs
	save `matched_test_pregs'
	restore

	* drop extra test-preg combos for tests without a matching preg (but mom in MFR)
	drop if !mi(pregnancy)
	* NOTE(review): only the variables listed below are removed. Other MFR-derived
	* columns (for example birth_date and preg_id_mfr) remain on these rows with the
	* values of whichever row survives duplicates drop -- confirm this is intended.
	local mfr_varlist childid bfoddat birth_flag nonsingleton num_birth_records  ///
	  bfoddat_yr bfoddat_month baby_diagnosis_birth diff_bd_test
	drop `mfr_varlist'
	duplicates drop test_id, force
	replace in_MFR = 0 
	
	* Append back matched test-pregs
	append using `matched_test_pregs'
	
	* drop extra variables 
	drop assigned_preg diff_bpsms close_no_match
	
end

program assign_unique_pregnancy_ids
	* Purpose: give every test that has no matched MFR pregnancy its own
	* pregnancy id, numbered after the largest id already in use. Rows sharing
	* a test_id receive the same new id. Leaves the helper variable gr_test in
	* the data and stops with an error if any pregnancy is still missing.
	di "assign pregnancy ID to tests that don't match to MFR"
	* Only the maximum is needed, so the detailed summary is skipped
	qui sum pregnancy, meanonly
	* With no matched pregnancy at all r(max) is missing; start numbering at 1
	* in that case instead of turning every new id into a missing value
	local max_preg = cond(mi(r(max)), 0, r(max))
	egen gr_test = group(test_id) if mi(pregnancy)
	replace pregnancy = gr_test + `max_preg' if mi(pregnancy)
	
	assert !mi(pregnancy)
end


program label_vars
    * Attaches a descriptive label to each variable of the master dataset.
    * Every variable named here must exist, otherwise the program stops.

    * Identifiers and match status
    label variable test_id                    "Test ID -- group(lopnr undersokningsdatum foster_risk)"
    label variable mother_status              "Mother in PregR or MFR?"

    * Mother and pregnancy background
    label variable lan                        "County (kub)"
    label variable paritet                    "Parity"
    label variable rokare                     "Mother smoking?"
    label variable sm_datum                   "Date of last period"
    label variable ivf_graviditet             "IVF dummy"
    label variable ivf_typ                    "IVF type"
    label variable et_datum                   "Date of transferring fertalized egg (IVF only)"
    label variable bp_enligt_et               "BP date -- ET"
    label variable bpsm_kub                   "sm_datum + 280"
    label variable preg_end_date_kub          "Pregnancy expected end date (kub)"

    * Examination
    label variable undersokningsdatum         "Testing date (kub)"
    label variable patient_alder_dec          "Patient age -- Dec prior to exam"
    label variable patient_alder_vid_bpu      "Patient age at expected due date"
    label variable nipt_utfort                "NIPT performed (kub)"
    label variable foster_ul                  "Which fetus is the nupp_mm measuring?"
    label variable nupp_mm                    "Neck width from ultrasound (mm)"
    label variable anatomisk_bedomning_utfort "Anatomic evaluation performed (kub)"
    label variable diagnos_datum              "Diagnosis date"
    label variable provtagningsdatum          "When is blood drawn"
    label variable ga_vid_v                   "Gestational age in full weeks (kub)"
    label variable ga_vid_d                   "Gestational age -- days after full weeks (kub)"
    label variable ga_vid_dagar               "Gestational age in days (kub)"

    * Risk scores and follow-up
    label variable foster_risk                "Which fetus is the risk score referring to?"
    label variable kombinerad_risk_t21        "Risk score -- Trisomy 21"
    label variable kombinerad_risk_t13_18     "Risk score -- Trisomy 13/18"
    label variable procedurdatum              "Invasive test date (KUB update)"
end


* Execute: run the full build defined in program main, then close the log
* opened at the top of the file
main
log close
